IE360 Project
1. Introduction
First we will start by making necessary arrangements in the data files and check the summary.
require(rmdformats)
require(data.table)
require(skimr)
require(ggcorrplot)
require(lubridate)
require(forecast)
require(tseries)
require(urca)
production_path <- "C:/Users/Asus/Desktop/2022 - Spring/IE360/Project/2022-06-06_production.csv"
weather_path <- "C:/Users/Asus/Desktop/2022 - Spring/IE360/Project/2022-06-06_weather.csv"
prod <- fread(production_path)
long_weather <- fread(weather_path)
prod2 <- copy(prod)
threedays=c("2022-06-05","2022-06-06","2022-06-07")
tmpprod=matrix(0,nrow=72,ncol=3)
tmpprod[,1]=c(rep(threedays[1],24),rep(threedays[2],24),rep(threedays[3],24))
tmpprod[,2]=rep(0:23,3)
tmpprod[,3]=rep(NA,72)
tmpprod=data.table(tmpprod)
setnames(tmpprod,"V1","date")
setnames(tmpprod,"V2","hour")
setnames(tmpprod,"V3","production")
tmpprod[,date:=as.IDate(date)]
tmpprod[,hour:=as.integer(hour)]
tmpprod[,production:=as.double(production)]
prod=rbind(prod,tmpprod)
prod[,dateTime := ymd(date) + dhours(hour)]
prod = prod[order(dateTime)]
head(prod)## date hour production dateTime
## 1: 2021-02-01 0 0 2021-02-01 00:00:00
## 2: 2021-02-01 1 0 2021-02-01 01:00:00
## 3: 2021-02-01 2 0 2021-02-01 02:00:00
## 4: 2021-02-01 3 0 2021-02-01 03:00:00
## 5: 2021-02-01 4 0 2021-02-01 04:00:00
## 6: 2021-02-01 5 0 2021-02-01 05:00:00
Our data set involves date and production values for a period of 2 years. Below, some graphics are printed to see which relations to investigate through project and build model on.
ggplot(prod,aes(x=dateTime,y=production)) + geom_line() + ggtitle("Hourly Production Values")+xlab("Date")+ylab("Production")# first week of the given data
ggplot(prod[date <= "2021-02-08" & date >= "2021-02-01"],aes(x=dateTime,y=production)) + geom_line(color="blue") + ggtitle("Hourly Production Values In The First Week")+xlab("Date")+ylab("Production")# daily data
daily_series <- prod[,list(daily_production = mean(production)),by= list(date)]
daily_series=daily_series[!is.na(daily_production)]
ggplot(daily_series,aes(x=date,y=daily_production)) + geom_line(color="red") + ggtitle("Daily Production Values")+xlab("Date")+ylab("Production")# monthly data
prod$month = month(prod$date)
prod$year = year(prod$date)
monthly_production <- prod[,list(monthly_production = mean(production)),by= list(month,year)]
monthly_production[,date := as.Date(paste(year,"-",month,"-01",sep=""), format='%Y-%m-%d')]
ggplot(monthly_production,aes(x=date,y=monthly_production)) + geom_line(color="darkgreen") + ggtitle("Monthly Production Values")+xlab("Date")+ylab("Production")As for the next steps, the time series will be tried to modeled in the sense of different predictors while trying different models in each step to describe the solar power.
3. Approach
3.1 Trend and Seasonality in Daily Production
daily_series[,monthFactor := as.factor(format(daily_series$date, "%m"))]
daily_series[,trend := 1: .N]
# model with trend
Model_trend<- lm(daily_production ~ trend, daily_series)
summary(Model_trend)##
## Call:
## lm(formula = daily_production ~ trend, data = daily_series)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.9336 -3.5487 0.7216 4.0270 7.3116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.438243 0.432192 21.838 < 2e-16 ***
## trend 0.005190 0.001547 3.354 0.000859 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.742 on 481 degrees of freedom
## Multiple R-squared: 0.02285, Adjusted R-squared: 0.02082
## F-statistic: 11.25 on 1 and 481 DF, p-value: 0.0008593
daily_series[,trend_const := predict(Model_trend,daily_series)]
ggplot(daily_series,aes(x=date)) + geom_line(aes(y=daily_production, color="real")) + geom_line(aes(y=trend_const, color="trend_const")) + ggtitle("Daily Production Values vs. Trend")+xlab("Date")+ylab("Production")# residuals
ggplot(daily_series,aes(x=date)) + geom_line(aes(y=daily_production - trend_const, color="residual")) + ggtitle("Difference Between Daily Production Values and Trend Constant")+xlab("Date")+ylab("Residual")checkresiduals(Model_trend$residuals)There is seen an increasing trend, when we compare the same values for January-April for the years 2021 and 2022. Also the increase of the variance can be inspected visually as the time passes by. Model constructed with trend component shows that the daily production values are associated with more than just a trend component. By looking at the residual analysis, we can conclude that residuals are clearly not independent and don’t have constant variance. Moreover it can be seen that, while winter month production values are well below the trend line, summer months are larger than the trend. This hints a seasonality component affecting the production values.
# add seasonality component
Model_trend_seasonality <- lm(daily_production ~ monthFactor + trend, daily_series)
summary(Model_trend_seasonality)##
## Call:
## lm(formula = daily_production ~ monthFactor + trend, data = daily_series)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.8455 -1.5516 0.4299 1.7190 5.2906
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6091827 0.5730584 4.553 6.74e-06 ***
## monthFactor02 0.9161069 0.6089805 1.504 0.133
## monthFactor03 2.9727892 0.5927610 5.015 7.52e-07 ***
## monthFactor04 5.4486935 0.5913087 9.215 < 2e-16 ***
## monthFactor05 7.5669448 0.5850135 12.935 < 2e-16 ***
## monthFactor06 11.0004509 0.6721260 16.367 < 2e-16 ***
## monthFactor07 12.0013866 0.6883168 17.436 < 2e-16 ***
## monthFactor08 11.2879195 0.6992667 16.143 < 2e-16 ***
## monthFactor09 9.3786627 0.6830585 13.730 < 2e-16 ***
## monthFactor10 7.1358314 0.6738432 10.590 < 2e-16 ***
## monthFactor11 2.8192223 0.6763830 4.168 3.66e-05 ***
## monthFactor12 -0.7957239 0.6693514 -1.189 0.235
## trend 0.0106919 0.0009041 11.826 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.589 on 470 degrees of freedom
## Multiple R-squared: 0.7154, Adjusted R-squared: 0.7081
## F-statistic: 98.46 on 12 and 470 DF, p-value: < 2.2e-16
daily_series[,trend_Seasonality := predict(Model_trend_seasonality,daily_series)]
ggplot(daily_series,aes(x=date)) + geom_line(aes(y=daily_production, color="real")) + geom_line(aes(y=trend_Seasonality, color="trend_Seasonality")) + ggtitle("Daily Production Values vs. Trend+Seasonality")+xlab("Date")+ylab("Production")# residuals
ggplot(daily_series,aes(x=date)) + geom_line(aes(y=daily_production - trend_Seasonality, color="residual2")) + ggtitle("Difference Between Daily Production Values and Trend+Seasonality Factor")+xlab("Date")+ylab("Production")checkresiduals(Model_trend_seasonality$residuals)
By adding seasonality component, the level of the production values are
captured better. Adding the seasonality gave a better model, however
residuals still don’t have constant variance, high values in
autocorrelation and not normally distributed. Furthermore, other
variables still need to be considered, such as weather values.
3.2 Time Series Analysis
adf.test(daily_series$daily_production)##
## Augmented Dickey-Fuller Test
##
## data: daily_series$daily_production
## Dickey-Fuller = -2.194, Lag order = 7, p-value = 0.4961
## alternative hypothesis: stationary
# daily time series
daily_ts <- ts(daily_series$daily_production, start = c(2021,02,01), frequency = 365)
rootTest=ur.kpss(daily_ts)
summary(rootTest)##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.973
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
### time plot
autoplot(daily_ts) +
ggtitle("Time Series: Daily Production") +
ylab("Production in given units")### take the first difference
ddaily_ts <- diff(daily_ts)
### time plot
autoplot(ddaily_ts) +
ggtitle("Time Series: Change in Daily Production") +
ylab("Production in given units")### seasonality investigation
ggseasonplot(ddaily_ts) +
ggtitle("Season Plot: Change in Daily Production") +
ylab("Production in given units")ggseasonplot(ts(monthly_production$monthly_production[1:15], start = c(2021,02), frequency = 12)) + ggtitle("Season Plot: Monthly Production")+xlab("Date")+ylab("Production")Other than that, comparing the monthly trend of the beginning of years 2021 and 2022, one can say that a similarity is seen.
#Time Series Decomposition
acf(daily_ts[1:length(daily_ts)])pacf(daily_ts[1:length(daily_ts)])dailyts <- ts(daily_ts, freq=15)
daily_ts_additive<-decompose(dailyts, type="additive")
plot(daily_ts_additive) daily_ts_multip<-decompose(dailyts, type="multiplicative")
plot(daily_ts_multip)When examined the PACF graph for time series analysis where effects between the lags are extracted, it can be seen that last 3 days have a significant effect on the current day’s value.
3.3 External Variables, Correlation and Regression
# long to wide
wide_weather<- dcast(long_weather, date + hour~ variable + lat + lon , value.var = c("value"))
# daily weather
daily_weather <- dcast(long_weather, date ~ variable + lat + lon, value.var="value", fun.aggregate=mean)
daily_weather[,residual_model2 := c(daily_series$daily_production - daily_series$trend_Seasonality, rep(NA,13))]
correl_info=cor(daily_weather[1:nrow(daily_weather),2:37])
cbind(cor(cbind(data.table(daily_series$daily_production),daily_weather[1:454,2:37]))["V1",])## [,1]
## V1 1.0000000
## CLOUD_LOW_LAYER_36.25_33 -0.3661177
## CLOUD_LOW_LAYER_36.25_33.25 -0.3535146
## CLOUD_LOW_LAYER_36.25_33.5 -0.3451834
## CLOUD_LOW_LAYER_36.5_33 -0.3028587
## CLOUD_LOW_LAYER_36.5_33.25 -0.2875060
## CLOUD_LOW_LAYER_36.5_33.5 -0.2999016
## CLOUD_LOW_LAYER_36.75_33 -0.3487160
## CLOUD_LOW_LAYER_36.75_33.25 -0.3145881
## CLOUD_LOW_LAYER_36.75_33.5 -0.3231557
## DSWRF_36.25_33 0.5845948
## DSWRF_36.25_33.25 0.5891529
## DSWRF_36.25_33.5 0.5961929
## DSWRF_36.5_33 0.5743597
## DSWRF_36.5_33.25 0.5806802
## DSWRF_36.5_33.5 0.5942779
## DSWRF_36.75_33 0.5851839
## DSWRF_36.75_33.25 0.5888127
## DSWRF_36.75_33.5 0.5900025
## REL_HUMIDITY_36.25_33 -0.4816100
## REL_HUMIDITY_36.25_33.25 -0.4555036
## REL_HUMIDITY_36.25_33.5 -0.4160707
## REL_HUMIDITY_36.5_33 -0.4970045
## REL_HUMIDITY_36.5_33.25 -0.5231495
## REL_HUMIDITY_36.5_33.5 -0.5280639
## REL_HUMIDITY_36.75_33 -0.4985260
## REL_HUMIDITY_36.75_33.25 -0.5227333
## REL_HUMIDITY_36.75_33.5 -0.4952053
## TEMP_36.25_33 0.6005373
## TEMP_36.25_33.25 0.6054398
## TEMP_36.25_33.5 0.6013787
## TEMP_36.5_33 0.5782878
## TEMP_36.5_33.25 0.5907519
## TEMP_36.5_33.5 0.5992751
## TEMP_36.75_33 0.5813030
## TEMP_36.75_33.25 0.5932988
## TEMP_36.75_33.5 0.5806460
Most of the variables have a correlation value around 0.5 depending on the coordinate, however none of them have a high correlation value itself. That’s why the mean of locations will be taken into account to build a model and the significance of their effect will be discussed based on the model.
temp=wide_weather[date<="2022-06-06"]
rownum=nrow(prod)/24
x1=0
for(i in 1:(rownum*24)){
x1[i]=mean(as.matrix(temp[i,2:10]))
}
x2=0
for(i in 1:(rownum*24)){
x2[i]=mean(as.matrix(temp[i,11:19]))
}
x3=0
for(i in 1:(rownum*24)){
x3[i]=mean(as.matrix(temp[i,20:28]))
}
x4=0
for(i in 1:(rownum*24)){
x4[i]=mean(as.matrix(temp[i,29:37]))
}
prod[,trend:=1:.N]
prod[,m_sf:=x1]
prod[,m_hum:=x2]
prod[,m_cl:=x3]
prod[,m_tmp:=x4]
library(corrplot)
library(RColorBrewer)
?data.table
cR <- prod[,c(3,8,9,10,11)]
CorrGraph <- as.data.frame(cR)
corr <- cor(CorrGraph, use = "pairwise.complete.obs")
ggcorrplot(corr)
As seen in the above figure, humidity level and cloud layers have a
stronger correlation with production values more than other variables.
So we expect to use these variables in our model, however it will be
better to analyze while building the model.
capacity=matrix(0,nrow=(nrow(prod)/24)-30,24)
for(i in 0:23){
temp_prod=prod[hour==i]
for(j in 31:(nrow(prod)/24)){
capacity[(j-30),(i+1)]<-max(as.matrix(temp_prod[(j-30):j]$production))
}
}
cap_temp=NULL
for(j in 1:nrow(capacity)){
cap_temp=c(cap_temp,capacity[j,1:24])
}
capacity=c(rep(NA,30*24),cap_temp)
prod[,capacity:=capacity]
prod[,lag72:=shift(prod$production, 72L, NA)]
prod[,lagcap72:=shift(prod$capacity, 72L, NA)]At last, capacity values are added to the data set, since for each hour within the month, there is a maximum level that solar panel can produce up-to. Also the lagged value of 3 days both for production and capacity to the data set for modeling purposes, we could not add lag 1-2 since these dates are not provided in the resource.
3.4 Moving Average Method
daily_series[,ma_week := ma(daily_series$daily_production, order=7)]
ggplot(daily_series,aes(x=date)) +geom_line(aes(y = daily_production, color = "daily_production")) + geom_line(aes(y = ma_week, color = "weekly_ma")) + ggtitle("Moving Average Plot of Production Values")+xlab("Date")+ylab("Production")daily_series[,ma_month := ma(daily_series$daily_production, order=30)]
ggplot(daily_series,aes(x=date)) +geom_line(aes(y = daily_production, color = "daily_production")) + geom_line(aes(y = ma_month, color = "monthly_ma")) + ggtitle("Moving Average Plot of Production Values")+xlab("Date")+ylab("Production")4. Building Models
4.1 Linear Regression Approach
There will be 2 linear models will be built through this section, the first one is a general model, which is irrespective of hours based on a general trend, seasonality, lagged values and external variables. These model is likely to require manual correction since in some hours, the facility is not producing any solar power at all. The second one will be an hourly model, which is a preferred way to increase predictability of the model and in our case, has a better understanding of the hourly variations as an initial assumption. The best model of the first approach will be decided based on R^2 values and AIC.
fit1=lm(production~as.factor(hour)+as.factor(month)+m_sf+m_hum+m_cl+m_tmp+lag72,prod)
summary(fit1)##
## Call:
## lm(formula = production ~ as.factor(hour) + as.factor(month) +
## m_sf + m_hum + m_cl + m_tmp + lag72, data = prod)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.551 -1.629 0.205 2.063 25.363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.273e+01 4.296e+00 -5.290 1.25e-07 ***
## as.factor(hour)1 1.356e-02 3.623e-01 0.037 0.970151
## as.factor(hour)2 2.576e-02 3.624e-01 0.071 0.943330
## as.factor(hour)3 3.917e-02 3.625e-01 0.108 0.913959
## as.factor(hour)4 4.330e-02 3.626e-01 0.119 0.904965
## as.factor(hour)5 6.547e-02 3.628e-01 0.180 0.856817
## as.factor(hour)6 6.938e-01 3.631e-01 1.911 0.056086 .
## as.factor(hour)7 4.507e+00 3.696e-01 12.193 < 2e-16 ***
## as.factor(hour)8 9.898e+00 3.963e-01 24.974 < 2e-16 ***
## as.factor(hour)9 1.225e+01 4.166e-01 29.412 < 2e-16 ***
## as.factor(hour)10 9.950e+00 4.645e-01 21.422 < 2e-16 ***
## as.factor(hour)11 9.432e+00 4.835e-01 19.510 < 2e-16 ***
## as.factor(hour)12 8.886e+00 4.998e-01 17.778 < 2e-16 ***
## as.factor(hour)13 8.352e+00 5.115e-01 16.330 < 2e-16 ***
## as.factor(hour)14 7.604e+00 5.155e-01 14.752 < 2e-16 ***
## as.factor(hour)15 6.651e+00 5.123e-01 12.983 < 2e-16 ***
## as.factor(hour)16 5.351e+00 4.708e-01 11.366 < 2e-16 ***
## as.factor(hour)17 1.529e+00 4.394e-01 3.480 0.000504 ***
## as.factor(hour)18 -1.624e+00 4.165e-01 -3.900 9.69e-05 ***
## as.factor(hour)19 -2.412e+00 3.994e-01 -6.038 1.61e-09 ***
## as.factor(hour)20 -2.062e+00 3.875e-01 -5.322 1.05e-07 ***
## as.factor(hour)21 -1.719e+00 3.803e-01 -4.521 6.22e-06 ***
## as.factor(hour)22 -8.047e-02 3.625e-01 -0.222 0.824326
## as.factor(hour)23 -6.206e-02 3.624e-01 -0.171 0.864036
## as.factor(month)2 -3.548e-01 2.699e-01 -1.315 0.188675
## as.factor(month)3 2.521e-02 2.647e-01 0.095 0.924123
## as.factor(month)4 8.181e-01 3.062e-01 2.672 0.007553 **
## as.factor(month)5 1.509e+00 3.454e-01 4.369 1.26e-05 ***
## as.factor(month)6 2.227e+00 3.892e-01 5.720 1.09e-08 ***
## as.factor(month)7 2.245e+00 4.409e-01 5.093 3.58e-07 ***
## as.factor(month)8 2.308e+00 4.503e-01 5.125 3.03e-07 ***
## as.factor(month)9 1.933e+00 3.953e-01 4.891 1.02e-06 ***
## as.factor(month)10 1.887e+00 3.564e-01 5.295 1.21e-07 ***
## as.factor(month)11 4.991e-01 3.306e-01 1.510 0.131132
## as.factor(month)12 -9.900e-01 3.008e-01 -3.292 0.000999 ***
## m_sf 1.733e-02 3.268e-03 5.302 1.16e-07 ***
## m_hum 7.974e-03 8.086e-04 9.861 < 2e-16 ***
## m_cl 5.478e-03 5.288e-03 1.036 0.300298
## m_tmp 8.233e-02 1.715e-02 4.801 1.60e-06 ***
## lag72 5.194e-01 7.918e-03 65.597 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.612 on 11480 degrees of freedom
## (144 observations deleted due to missingness)
## Multiple R-squared: 0.8491, Adjusted R-squared: 0.8486
## F-statistic: 1656 on 39 and 11480 DF, p-value: < 2.2e-16
AIC(fit1)## [1] 72477.89
fit2=lm(sqrt(production)~as.factor(hour)+as.factor(month)+m_sf+m_hum+m_cl+m_tmp+lag72,prod)
summary(fit2)##
## Call:
## lm(formula = sqrt(production) ~ as.factor(hour) + as.factor(month) +
## m_sf + m_hum + m_cl + m_tmp + lag72, data = prod)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5464 -0.2822 0.0085 0.3475 3.1261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.6891078 0.5948894 -6.201 5.79e-10 ***
## as.factor(hour)1 0.0026898 0.0501647 0.054 0.957239
## as.factor(hour)2 0.0051999 0.0501737 0.104 0.917459
## as.factor(hour)3 0.0080310 0.0501898 0.160 0.872874
## as.factor(hour)4 0.0098118 0.0502113 0.195 0.845075
## as.factor(hour)5 0.0910330 0.0502398 1.812 0.070017 .
## as.factor(hour)6 0.6794833 0.0502787 13.514 < 2e-16 ***
## as.factor(hour)7 1.8409219 0.0511786 35.971 < 2e-16 ***
## as.factor(hour)8 2.8081603 0.0548736 51.175 < 2e-16 ***
## as.factor(hour)9 3.1291155 0.0576760 54.253 < 2e-16 ***
## as.factor(hour)10 2.9390628 0.0643120 45.700 < 2e-16 ***
## as.factor(hour)11 2.9208800 0.0669415 43.633 < 2e-16 ***
## as.factor(hour)12 2.8894030 0.0692095 41.749 < 2e-16 ***
## as.factor(hour)13 2.8334353 0.0708168 40.011 < 2e-16 ***
## as.factor(hour)14 2.7527034 0.0713712 38.569 < 2e-16 ***
## as.factor(hour)15 2.6364547 0.0709363 37.167 < 2e-16 ***
## as.factor(hour)16 2.4725047 0.0651861 37.930 < 2e-16 ***
## as.factor(hour)17 1.8341520 0.0608460 30.144 < 2e-16 ***
## as.factor(hour)18 0.8174765 0.0576654 14.176 < 2e-16 ***
## as.factor(hour)19 0.0598232 0.0553037 1.082 0.279399
## as.factor(hour)20 -0.2054355 0.0536493 -3.829 0.000129 ***
## as.factor(hour)21 -0.1720092 0.0526535 -3.267 0.001091 **
## as.factor(hour)22 -0.0126516 0.0501892 -0.252 0.800985
## as.factor(hour)23 -0.0095389 0.0501802 -0.190 0.849240
## as.factor(month)2 0.0214271 0.0373680 0.573 0.566380
## as.factor(month)3 0.1941191 0.0366471 5.297 1.20e-07 ***
## as.factor(month)4 0.3631194 0.0423936 8.565 < 2e-16 ***
## as.factor(month)5 0.4829073 0.0478182 10.099 < 2e-16 ***
## as.factor(month)6 0.5391721 0.0538956 10.004 < 2e-16 ***
## as.factor(month)7 0.4907044 0.0610404 8.039 9.94e-16 ***
## as.factor(month)8 0.4433350 0.0623518 7.110 1.23e-12 ***
## as.factor(month)9 0.3464028 0.0547333 6.329 2.56e-10 ***
## as.factor(month)10 0.2841581 0.0493467 5.758 8.71e-09 ***
## as.factor(month)11 0.0777981 0.0457761 1.700 0.089245 .
## as.factor(month)12 -0.1659835 0.0416432 -3.986 6.77e-05 ***
## m_sf 0.0024661 0.0004525 5.450 5.15e-08 ***
## m_hum 0.0009617 0.0001120 8.590 < 2e-16 ***
## m_cl -0.0011141 0.0007323 -1.522 0.128154
## m_tmp 0.0133222 0.0023741 5.611 2.05e-08 ***
## lag72 0.0640073 0.0010964 58.380 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7771 on 11480 degrees of freedom
## (144 observations deleted due to missingness)
## Multiple R-squared: 0.9016, Adjusted R-squared: 0.9013
## F-statistic: 2698 on 39 and 11480 DF, p-value: < 2.2e-16
AIC(fit2)## [1] 26923.92
fit3=lm(sqrt(production)~as.factor(hour)+as.factor(month)+m_cl+m_tmp+lag72,prod)
summary(fit3)##
## Call:
## lm(formula = sqrt(production) ~ as.factor(hour) + as.factor(month) +
## m_cl + m_tmp + lag72, data = prod)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5553 -0.3029 0.0187 0.3655 3.1014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.7883263 0.5950948 -6.366 2.02e-10 ***
## as.factor(hour)1 0.0020267 0.0503224 0.040 0.9679
## as.factor(hour)2 0.0039672 0.0503311 0.079 0.9372
## as.factor(hour)3 0.0057273 0.0503464 0.114 0.9094
## as.factor(hour)4 0.0064901 0.0503664 0.129 0.8975
## as.factor(hour)5 0.0853587 0.0503919 1.694 0.0903 .
## as.factor(hour)6 0.6703317 0.0504229 13.294 < 2e-16 ***
## as.factor(hour)7 1.8265644 0.0513008 35.605 < 2e-16 ***
## as.factor(hour)8 2.7987923 0.0550071 50.881 < 2e-16 ***
## as.factor(hour)9 3.1444664 0.0578063 54.397 < 2e-16 ***
## as.factor(hour)10 3.1127048 0.0610145 51.016 < 2e-16 ***
## as.factor(hour)11 3.1337146 0.0620458 50.506 < 2e-16 ***
## as.factor(hour)12 3.1320112 0.0629540 49.751 < 2e-16 ***
## as.factor(hour)13 3.0965975 0.0635569 48.722 < 2e-16 ***
## as.factor(hour)14 3.0265649 0.0635374 47.634 < 2e-16 ***
## as.factor(hour)15 2.9132784 0.0628803 46.331 < 2e-16 ***
## as.factor(hour)16 2.7162833 0.0586814 46.289 < 2e-16 ***
## as.factor(hour)17 2.0545852 0.0551904 37.227 < 2e-16 ***
## as.factor(hour)18 1.0047587 0.0534405 18.801 < 2e-16 ***
## as.factor(hour)19 0.2054776 0.0527348 3.896 9.82e-05 ***
## as.factor(hour)20 -0.0948761 0.0522082 -1.817 0.0692 .
## as.factor(hour)21 -0.0798113 0.0516923 -1.544 0.1226
## as.factor(hour)22 -0.0045432 0.0503305 -0.090 0.9281
## as.factor(hour)23 -0.0019946 0.0503222 -0.040 0.9684
## as.factor(month)2 0.0234203 0.0369162 0.634 0.5258
## as.factor(month)3 0.2310506 0.0360083 6.417 1.45e-10 ***
## as.factor(month)4 0.4546296 0.0391804 11.604 < 2e-16 ***
## as.factor(month)5 0.6032663 0.0434650 13.879 < 2e-16 ***
## as.factor(month)6 0.6473160 0.0500720 12.928 < 2e-16 ***
## as.factor(month)7 0.6259965 0.0569603 10.990 < 2e-16 ***
## as.factor(month)8 0.5836417 0.0582418 10.021 < 2e-16 ***
## as.factor(month)9 0.4309642 0.0519825 8.291 < 2e-16 ***
## as.factor(month)10 0.3523836 0.0471524 7.473 8.39e-14 ***
## as.factor(month)11 0.1052301 0.0449712 2.340 0.0193 *
## as.factor(month)12 -0.1824171 0.0415807 -4.387 1.16e-05 ***
## m_cl 0.0031451 0.0005342 5.887 4.03e-09 ***
## m_tmp 0.0127089 0.0023719 5.358 8.57e-08 ***
## lag72 0.0657584 0.0010777 61.018 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7795 on 11482 degrees of freedom
## (144 observations deleted due to missingness)
## Multiple R-squared: 0.901, Adjusted R-squared: 0.9007
## F-statistic: 2824 on 37 and 11482 DF, p-value: < 2.2e-16
AIC(fit3)## [1] 26994.3
fit4=lm(sqrt(production)~as.factor(hour)+as.factor(month)+m_cl+m_tmp+sqrt(lag72)+lagcap72,prod)
summary(fit4)##
## Call:
## lm(formula = sqrt(production) ~ as.factor(hour) + as.factor(month) +
## m_cl + m_tmp + sqrt(lag72) + lagcap72, data = prod)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5970 -0.1814 0.0254 0.3423 2.5319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.4744786 0.5524178 -6.290 3.31e-10 ***
## as.factor(hour)1 0.0014969 0.0458253 0.033 0.973942
## as.factor(hour)2 0.0029132 0.0458335 0.064 0.949322
## as.factor(hour)3 0.0040516 0.0458480 0.088 0.929585
## as.factor(hour)4 0.0041121 0.0458667 0.090 0.928564
## as.factor(hour)5 0.0661014 0.0458965 1.440 0.149832
## as.factor(hour)6 0.4271414 0.0462871 9.228 < 2e-16 ***
## as.factor(hour)7 0.8282281 0.0508426 16.290 < 2e-16 ***
## as.factor(hour)8 0.8137141 0.0630039 12.915 < 2e-16 ***
## as.factor(hour)9 0.7808307 0.0698604 11.177 < 2e-16 ***
## as.factor(hour)10 0.6833025 0.0732642 9.327 < 2e-16 ***
## as.factor(hour)11 0.7194662 0.0738323 9.745 < 2e-16 ***
## as.factor(hour)12 0.7108004 0.0744046 9.553 < 2e-16 ***
## as.factor(hour)13 0.6649053 0.0749035 8.877 < 2e-16 ***
## as.factor(hour)14 0.5487269 0.0750580 7.311 2.85e-13 ***
## as.factor(hour)15 0.4287194 0.0740950 5.786 7.41e-09 ***
## as.factor(hour)16 0.4870226 0.0672908 7.238 4.88e-13 ***
## as.factor(hour)17 0.6615598 0.0559382 11.827 < 2e-16 ***
## as.factor(hour)18 0.5123022 0.0496352 10.321 < 2e-16 ***
## as.factor(hour)19 0.0784216 0.0480523 1.632 0.102708
## as.factor(hour)20 -0.1180200 0.0475542 -2.482 0.013087 *
## as.factor(hour)21 -0.0960450 0.0470811 -2.040 0.041376 *
## as.factor(hour)22 -0.0037228 0.0458332 -0.081 0.935264
## as.factor(hour)23 -0.0014839 0.0458252 -0.032 0.974168
## as.factor(month)2 0.1275427 0.0373998 3.410 0.000651 ***
## as.factor(month)3 0.3008294 0.0325805 9.233 < 2e-16 ***
## as.factor(month)4 0.4110397 0.0351014 11.710 < 2e-16 ***
## as.factor(month)5 0.4389824 0.0392742 11.177 < 2e-16 ***
## as.factor(month)6 0.3358973 0.0454307 7.394 1.54e-13 ***
## as.factor(month)7 0.3328620 0.0517240 6.435 1.28e-10 ***
## as.factor(month)8 0.3111757 0.0527221 5.902 3.70e-09 ***
## as.factor(month)9 0.1879966 0.0468333 4.014 6.01e-05 ***
## as.factor(month)10 0.1571445 0.0420855 3.734 0.000189 ***
## as.factor(month)11 -0.0775350 0.0400149 -1.938 0.052692 .
## as.factor(month)12 -0.2656401 0.0367417 -7.230 5.16e-13 ***
## m_cl 0.0038370 0.0004806 7.984 1.56e-15 ***
## m_tmp 0.0117542 0.0022000 5.343 9.33e-08 ***
## sqrt(lag72) 0.2160593 0.0096003 22.506 < 2e-16 ***
## lagcap72 0.0920279 0.0019330 47.608 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6873 on 10761 degrees of freedom
## (864 observations deleted due to missingness)
## Multiple R-squared: 0.9253, Adjusted R-squared: 0.9251
## F-statistic: 3509 on 38 and 10761 DF, p-value: < 2.2e-16
AIC(fit4)## [1] 22591.54
checkresiduals(fit4)##
## Breusch-Godfrey test for serial correlation of order up to 42
##
## data: Residuals
## LM test = 6768.8, df = 42, p-value < 2.2e-16
prod[,lm4_pred:=(predict(fit4,prod))^2]
daily_pr<-prod[,list(daily_production = mean(production),daily_prediction = mean(lm4_pred)),by= list(date)]
monthly_pr<-prod[,list(monthly_production = mean(production),monthly_prediction = mean(lm4_pred),
date = as.Date(paste(year,"-",month,"-01",sep=""), format='%Y-%m-%d')),by= list(month,year)]
ggplot(prod,aes(x=date)) +geom_line(aes(y = production, color = "production")) + geom_line(aes(y = (predict(fit4,prod))^2, color = "prediction")) +ggtitle("Hourly Production vs Predicted Values")+xlab("Date")+ylab("Production")ggplot(daily_pr,aes(x=date)) +geom_line(aes(y = daily_production, color = "daily_production")) + geom_line(aes(y = daily_prediction, color = "daily_prediction")) +ggtitle("Daily Production vs Predicted Values")+xlab("Date")+ylab("Production")ggplot(monthly_pr,aes(x=date)) +geom_line(aes(y =monthly_production , color = "monthly_production")) + geom_line(aes(y = monthly_prediction, color = "monthly_prediction")) +ggtitle("Monthly Production vs Predicted Values")+xlab("Date")+ylab("Production")#0,1,2,3,4,5,20,21,22,23 always zeroFirst model is built based on external variables like cloud layer, temperature, lagged values of capacity and production, seasonality and hourly seasonality. It has the lowest value of AIC and sufficient value of R^2 to represent our dataset. Square root is used to deal with the increasing variance of the dataset, all predictors look significant so this is the winning model for our first approach of linear models.
4.2 Hourly Linear Regression Model
# identify # of days to forecast
todays_date=as.Date("2022-06-06")
forecast_date=todays_date+1
latest_available_prod_date=as.Date(max(prod2$date))
n_days=as.numeric(forecast_date-latest_available_prod_date)
forecasted_production=tail(prod2,n_days*24)
forecasted_production[,date:=date+n_days]
forecasted_production[,production:=NA]
# actual production data with forecasted dates
production_with_forecast=rbind(prod2,forecasted_production)
mdata <- melt(production_with_forecast, id=c("date","production"))
production_in_hours <- data.table()
info_for_hour1 <- mdata[value == 1]
production_in_hours[,date:=info_for_hour1$date]
hours <- c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)
for(i in hours){
colname = paste("V", i, sep = "")
colname
b <- mdata[value == i]
production_in_hours[,as.character(colname) := b$production]
}
weatherup <- data.table()
weatherup[,date:=production_with_forecast$date]
weatherup[,hour:=production_with_forecast$hour]
weatherup[,m_sf:=x1]
weatherup[,m_hum:=x2]
weatherup[,m_cl:=x3]
weatherup[,m_tmp:=x4]
daily_weather <- weatherup[,list(daily_temp = mean(m_tmp),daily_hum = mean(m_hum),daily_sf = mean(m_sf),daily_cl = mean(m_cl)),by= list(date)]
prod_weather = merge(production_in_hours, daily_weather,by="date")
## example for lm model for hour 8
hour8_prod <- data.table()
hour8_prod[,date:=prod_weather$date]
hour8_prod[,production:=prod_weather$V8]
hour8_prod[,monthFactor := as.factor(format(hour8_prod$date, "%m"))]
hour8_prod[,trend := 1: .N]
# model with trend
Model_trend<- lm(production ~ trend + monthFactor, hour8_prod)
summary(Model_trend)##
## Call:
## lm(formula = production ~ trend + monthFactor, data = hour8_prod)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.1459 -2.3719 0.7116 3.1244 12.7317
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.782727 1.368016 -2.765 0.005914 **
## trend 0.020321 0.002158 9.415 < 2e-16 ***
## monthFactor02 4.937278 1.453770 3.396 0.000741 ***
## monthFactor03 13.878178 1.415051 9.808 < 2e-16 ***
## monthFactor04 20.704592 1.411584 14.668 < 2e-16 ***
## monthFactor05 28.500032 1.396556 20.407 < 2e-16 ***
## monthFactor06 37.074909 1.604512 23.107 < 2e-16 ***
## monthFactor07 37.960316 1.643163 23.102 < 2e-16 ***
## monthFactor08 34.778986 1.669303 20.834 < 2e-16 ***
## monthFactor09 32.762496 1.630611 20.092 < 2e-16 ***
## monthFactor10 23.220076 1.608612 14.435 < 2e-16 ***
## monthFactor11 8.204412 1.614675 5.081 5.42e-07 ***
## monthFactor12 0.190950 1.597889 0.120 0.904929
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.18 on 470 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.8086, Adjusted R-squared: 0.8037
## F-statistic: 165.4 on 12 and 470 DF, p-value: < 2.2e-16
hour8_prod[,trend_const := predict(Model_trend,hour8_prod)]
ggplot(hour8_prod[25:476],aes(x=date)) +
geom_line(aes(y=production, color="real")) +
geom_line(aes(y=trend_const, color="trend_const")) +
ggtitle("At hour 8 Production Values vs. Trend")+xlab("Date")+ylab("Production")acf(hour8_prod$production[1:473] - hour8_prod$trend_const[1:473])hour8_prod[,lag3:=shift(hour8_prod$production, 3L, NA)]
hour8_prod[,ma_2 := ma(hour8_prod$production, order=2)]
hour8_prod$ma_2[483:486] = hour8_prod$ma_2[482]
Model_trend2<- lm(production ~ trend + monthFactor + lag3 +ma_2, hour8_prod)
summary(Model_trend2)##
## Call:
## lm(formula = production ~ trend + monthFactor + lag3 + ma_2,
## data = hour8_prod)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.9203 -0.9662 -0.0702 0.9435 8.8139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.518200 0.602497 0.860 0.390184
## trend -0.002638 0.001104 -2.389 0.017289 *
## monthFactor02 -0.661085 0.646672 -1.022 0.307176
## monthFactor03 -1.905290 0.727121 -2.620 0.009072 **
## monthFactor04 -2.710790 0.845959 -3.204 0.001447 **
## monthFactor05 -3.742417 1.013034 -3.694 0.000247 ***
## monthFactor06 -4.816568 1.267780 -3.799 0.000164 ***
## monthFactor07 -4.940398 1.302663 -3.793 0.000169 ***
## monthFactor08 -4.532512 1.236919 -3.664 0.000277 ***
## monthFactor09 -4.261837 1.180884 -3.609 0.000341 ***
## monthFactor10 -2.974457 0.976310 -3.047 0.002446 **
## monthFactor11 -1.057269 0.740712 -1.427 0.154145
## monthFactor12 -0.066980 0.690397 -0.097 0.922755
## lag3 -0.030637 0.021198 -1.445 0.149050
## ma_2 1.160096 0.026524 43.737 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.669 on 465 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.9642, Adjusted R-squared: 0.9631
## F-statistic: 894.4 on 14 and 465 DF, p-value: < 2.2e-16
hour8_prod[,lm2 := predict(Model_trend2,hour8_prod)]
ggplot(hour8_prod[25:476],aes(x=date)) +
geom_line(aes(y=production, color="real")) +
geom_line(aes(y=lm2, color="trend_const")) +
ggtitle("At hour 8 Production Values vs. Trend")+xlab("Date")+ylab("Production")library(corrplot)
correlation_info = cor(prod_weather[,c(2:29)],method = "pearson", use = "pairwise.complete.obs")
ggcorrplot(correlation_info,
type = "lower",
lab = TRUE)## hum_9, m_temp8
hour8_prod[,hum := prod_weather$daily_hum]
#hour8_prod[,m_temp := production_with_weather_in_hours$m_tmp16]
model_with_regressors = lm(production ~ trend + monthFactor + hum + ma_2, hour8_prod)
summary(model_with_regressors)##
## Call:
## lm(formula = production ~ trend + monthFactor + hum + ma_2, data = hour8_prod)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.7420 -0.9658 -0.0683 0.9610 8.6721
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.810065 0.707939 2.557 0.010879 *
## trend -0.003871 0.001090 -3.552 0.000422 ***
## monthFactor02 -0.314544 0.649562 -0.484 0.628443
## monthFactor03 -1.388012 0.740519 -1.874 0.061503 .
## monthFactor04 -1.761964 0.901595 -1.954 0.051265 .
## monthFactor05 -2.653438 1.066933 -2.487 0.013232 *
## monthFactor06 -3.975258 1.257701 -3.161 0.001676 **
## monthFactor07 -4.230136 1.271799 -3.326 0.000950 ***
## monthFactor08 -3.849864 1.210595 -3.180 0.001570 **
## monthFactor09 -3.981256 1.119388 -3.557 0.000414 ***
## monthFactor10 -2.902274 0.920532 -3.153 0.001721 **
## monthFactor11 -1.059852 0.723267 -1.465 0.143494
## monthFactor12 -0.129858 0.683307 -0.190 0.849358
## hum -0.011048 0.003521 -3.138 0.001809 **
## ma_2 1.170719 0.026052 44.938 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.642 on 467 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.9651, Adjusted R-squared: 0.964
## F-statistic: 921.8 on 14 and 467 DF, p-value: < 2.2e-16
hour8_prod[,lm_regressors := predict(model_with_regressors,hour8_prod)]
ggplot(hour8_prod[25:476],aes(x=date)) +
geom_line(aes(y=production, color="real")) +
geom_line(aes(y=lm_regressors, color="regressors")) +
ggtitle("At hour 8 Production Values vs. regressors")+xlab("Date")+ylab("Production")checkresiduals(model_with_regressors)##
## Breusch-Godfrey test for serial correlation of order up to 18
##
## data: Residuals
## LM test = 274.2, df = 18, p-value < 2.2e-16
res <- resid(model_with_regressors)
plot(fitted(model_with_regressors), res)qqnorm(res)
qqline(res)hour5_prod <- prod_weather[,c(1,7,26:29)]
hour6_prod <- prod_weather[,c(1,8,26:29)]
hour7_prod <- prod_weather[,c(1,9,26:29)]
hour8_prod <- prod_weather[,c(1,10,26:29)]
hour9_prod <- prod_weather[,c(1,11,26:29)]
hour10_prod <- prod_weather[,c(1,12,26:29)]
hour11_prod <- prod_weather[,c(1,13,26:29)]
hour12_prod <- prod_weather[,c(1,14,26:29)]
hour13_prod <- prod_weather[,c(1,15,26:29)]
hour14_prod <- prod_weather[,c(1,16,26:29)]
hour15_prod <- prod_weather[,c(1,17,26:29)]
hour16_prod <- prod_weather[,c(1,18,26:29)]
hour17_prod <- prod_weather[,c(1,19,26:29)]
hour18_prod <- prod_weather[,c(1,20,26:29)]
hour19_prod <- prod_weather[,c(1,21,26:29)]
hour20_prod <- prod_weather[,c(1,22,26:29)]
dt_list = list(hour5_prod, hour6_prod,hour7_prod,hour8_prod,hour9_prod,hour10_prod,hour11_prod,
hour12_prod,hour13_prod,hour14_prod,hour15_prod,hour16_prod,hour17_prod,
hour18_prod,hour19_prod,hour20_prod)
prod[,hourly_prediction := 0.00]
prod[,hourly_prediction := as.numeric(hourly_prediction)]
hourly_models <- list()
t = 5
for (dt in dt_list){
dt[,production:=dt[,2]]
dt[,monthFactor := as.factor(format(dt$date, "%m"))]
dt[,trend := 1: .N]
dt[,ma_2 := ma(dt$production, order=2)]
dt[,ma_2 := as.numeric(ma_2)]
dt$ma_2[483:486] = dt$ma_2[482]
#print(length(dt$ma_2))
lm_fit = lm(production ~ trend + monthFactor + daily_hum + dt$ma_2 , dt)
hourly_models[[t-4]] <- summary(lm_fit)
dt[,lm_regressors := predict(lm_fit,dt)]
dt$lm_regressors[dt$lm_regressors < 0.1] = 0
#print(dt$lm_regressors)
prod$hourly_prediction[prod$hour == t] = dt$lm_regressors
#print(t)
t = t+1
}
for(j in 1:16){
# print(paste("CheckResidual Result for ",j+4, ".00", sep = "" ))
checkresiduals(hourly_models[[j]]$residuals)
# summary(hourly_models[[j]])
plotting_res <- data.frame(Residuals = hourly_models[[j]]$residuals,
Fitted_values = prod$hourly_prediction[prod$hour == j+4][1:482])
print(ggplot(plotting_res,aes(x = Fitted_values, y = Residuals)) +
geom_point()+
ggtitle(paste("CheckResidual Result for ",j+4, ".00", sep = "" ))+
geom_smooth(method = "lm", se = FALSE)
)
}prod[,hourly_res := prod$production - prod$hourly_prediction]
acf(prod$hourly_res,na.action=na.pass)ggplot(prod[date > "2022-05-01"],aes(x=date)) +
#geom_line(aes(y=production, color="real")) +
geom_line(aes(y=hourly_res, color="hourly_prediction")) +
ggtitle("Production Values vs. hourly_prediction")+xlab("Date")+ylab("Production")plot(prod$hourly_prediction, prod$hourly_res)qqnorm(prod$hourly_res)
qqline(prod$hourly_res)forecasted_production$production = prod$hourly_prediction[11593:11664]
forecasted_production## date hour production
## 1: 2022-06-05 0 0.0000000
## 2: 2022-06-05 1 0.0000000
## 3: 2022-06-05 2 0.0000000
## 4: 2022-06-05 3 0.0000000
## 5: 2022-06-05 4 0.0000000
## 6: 2022-06-05 5 0.1603602
## 7: 2022-06-05 6 6.0547428
## 8: 2022-06-05 7 20.3449774
## 9: 2022-06-05 8 38.6506089
## 10: 2022-06-05 9 38.2513582
## 11: 2022-06-05 10 37.8479789
## 12: 2022-06-05 11 37.9759081
## 13: 2022-06-05 12 37.6337775
## 14: 2022-06-05 13 35.3458720
## 15: 2022-06-05 14 37.4087170
## 16: 2022-06-05 15 38.3806470
## 17: 2022-06-05 16 37.1460449
## 18: 2022-06-05 17 27.7372204
## 19: 2022-06-05 18 8.8585923
## 20: 2022-06-05 19 1.3215929
## 21: 2022-06-05 20 0.0000000
## 22: 2022-06-05 21 0.0000000
## 23: 2022-06-05 22 0.0000000
## 24: 2022-06-05 23 0.0000000
## 25: 2022-06-06 0 0.0000000
## 26: 2022-06-06 1 0.0000000
## 27: 2022-06-06 2 0.0000000
## 28: 2022-06-06 3 0.0000000
## 29: 2022-06-06 4 0.0000000
## 30: 2022-06-06 5 0.1604328
## 31: 2022-06-06 6 6.0574386
## 32: 2022-06-06 7 20.2916524
## 33: 2022-06-06 8 38.4924878
## 34: 2022-06-06 9 38.1544603
## 35: 2022-06-06 10 37.7455914
## 36: 2022-06-06 11 37.8689575
## 37: 2022-06-06 12 37.5795517
## 38: 2022-06-06 13 35.3208000
## 39: 2022-06-06 14 37.3934852
## 40: 2022-06-06 15 38.2998956
## 41: 2022-06-06 16 37.0397807
## 42: 2022-06-06 17 27.6277965
## 43: 2022-06-06 18 8.8256045
## 44: 2022-06-06 19 1.3204848
## 45: 2022-06-06 20 0.0000000
## 46: 2022-06-06 21 0.0000000
## 47: 2022-06-06 22 0.0000000
## 48: 2022-06-06 23 0.0000000
## 49: 2022-06-07 0 0.0000000
## 50: 2022-06-07 1 0.0000000
## 51: 2022-06-07 2 0.0000000
## 52: 2022-06-07 3 0.0000000
## 53: 2022-06-07 4 0.0000000
## 54: 2022-06-07 5 0.1604040
## 55: 2022-06-07 6 6.0561201
## 56: 2022-06-07 7 20.3132393
## 57: 2022-06-07 8 38.5553622
## 58: 2022-06-07 9 38.1877329
## 59: 2022-06-07 10 37.7802892
## 60: 2022-06-07 11 37.9059090
## 61: 2022-06-07 12 37.5948902
## 62: 2022-06-07 13 35.3231327
## 63: 2022-06-07 14 37.3921466
## 64: 2022-06-07 15 38.3265395
## 65: 2022-06-07 16 37.0787368
## 66: 2022-06-07 17 27.6710907
## 67: 2022-06-07 18 8.8386613
## 68: 2022-06-07 19 1.3208997
## 69: 2022-06-07 20 0.0000000
## 70: 2022-06-07 21 0.0000000
## 71: 2022-06-07 22 0.0000000
## 72: 2022-06-07 23 0.0000000
## date hour production
When we compare the hourly model with general linear model, it is obvious that hourly model did better in capturing variations, understanding the behavior of time series, lower pacf values and more randomly scattered residuals by visual inspection.
4.3 ARIMA Approach
# fitting data
current_date="2022-06-01"
forecast_with_arima=function(data,forecast_ahead,target_name='Production',
is_seasonal=F,is_stepwise=F,is_trace=T,is_approx=F){
command_string=sprintf('input_series=data$%s',target_name)
print(command_string)
eval(parse(text=command_string))
fitted=auto.arima(input_series,seasonal=T,
trace=T,stepwise=T)
forecasted=forecast(fitted,h=forecast_ahead)
return(list(forecast=as.numeric(forecasted$mean),model=fitted))
}
forecast_with_arima(prod, 3, 'production', T, F, T, F) ## [1] "input_series=data$production"
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,1,2) with drift : 70174.9
## ARIMA(0,1,0) with drift : 73631.24
## ARIMA(1,1,0) with drift : 70478.02
## ARIMA(0,1,1) with drift : 70390.1
## ARIMA(0,1,0) : 73629.24
## ARIMA(1,1,2) with drift : 70172.24
## ARIMA(0,1,2) with drift : 70174.03
## ARIMA(1,1,1) with drift : 70174.53
## ARIMA(1,1,3) with drift : 70174.71
## ARIMA(0,1,3) with drift : 70170.73
## ARIMA(0,1,4) with drift : 70172.24
## ARIMA(1,1,4) with drift : Inf
## ARIMA(0,1,3) : 70168.73
## ARIMA(0,1,2) : 70172.03
## ARIMA(1,1,3) : 70172.71
## ARIMA(0,1,4) : 70170.24
## ARIMA(1,1,2) : 70170.24
## ARIMA(1,1,4) : Inf
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(0,1,3) : 70172.57
##
## Best model: ARIMA(0,1,3)
## $forecast
## [1] 0.02795086 0.02795086 0.02795086
##
## $model
## Series: input_series
## ARIMA(0,1,3)
##
## Coefficients:
## ma1 ma2 ma3
## 0.5713 0.1491 0.0211
## s.e. 0.0093 0.0107 0.0092
##
## sigma^2 = 24.92: log likelihood = -35082.28
## AIC=70172.57 AICc=70172.57 BIC=70202
ARIMA models are mainly based on past errors of the target variable by analyzing the lagged values and average values, however, in our case seasonality and non-stationary series exist, so a SARIMA model to improve the automated ARIMA model with seasonal adjustments is preferred. Auto arima is used to choose the model with lowest AIC/BIC and now seasonal components will be built on it.
s_arima <- arima(prod$production, order=c(0,1,3), seasonal=c(0,1,1)) %>%
residuals() %>% ggtsdisplay()## Warning: Removed 72 rows containing missing values (geom_point).
This model has a much higher AIC value and autocorrelated errors, so it
will be used to improve the final model but not to predict on the
production data mainly. Finally, we will try to include regressors to
the ARIMA model to check if any improvement is available.
arima_with_reg<- auto.arima(prod[,"production"],
xreg=as.matrix(prod[,c("m_sf","m_cl","m_tmp", "capacity")]))
arima_with_reg## Series: prod[, "production"]
## Regression with ARIMA(5,1,0) errors
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 m_sf m_cl m_tmp
## 0.0754 -0.2004 -0.1083 -0.0701 -0.0420 -0.0213 0.0075 0.3662
## s.e. 0.0096 0.0097 0.0098 0.0096 0.0096 0.0061 0.0030 0.0528
## capacity
## 0.7503
## s.e. 0.0061
##
## sigma^2 = 11.09: log likelihood = -28846.1
## AIC=57712.2 AICc=57712.22 BIC=57785.14
checkresiduals(arima_with_reg)##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(5,1,0) errors
## Q* = 347.21, df = 3, p-value < 2.2e-16
##
## Model df: 9. Total lags used: 12
And in this step, capacity and weather regressors are added to the model along with the errors. Even though it is not the case the models performed the best, it definitely performed better with the inclusion of external variables and it is the most out of ARIMA models in our study.
5. Forecasting and Evaluation
forecast_with_lr=function(fmla, data,forecast_data){
fitted_lm=lm(as.formula(fmla),data)
forecasted=predict(fitted_lm,forecast_data)
return(list(forecast=as.numeric(forecasted),model=fitted_lm))
}
train_start=as.Date('2021-02-01')
test_start=as.Date('2022-06-04')
test_end=as.Date('2022-06-07')
test_dates=seq(test_start,test_end, by='day')
test_dates## [1] "2022-06-04" "2022-06-05" "2022-06-06" "2022-06-07"
for(i in 1:length(test_dates)){
forecast_ahead=3
current_date=test_dates[i]-forecast_ahead
past_data=prod[date<=current_date]
forecast_data=prod[date==test_dates[i]]
forecasted=forecast_with_lr(fit4, past_data, forecast_data)
forecast_data[,lm_prediction:=forecasted$forecast^2]
}
forecast_data[,residuals:=production-lm_prediction]
forecast_data$lm_prediction## [1] 0.007730207 0.008049628 0.008493849 0.009914311 0.011850200
## [6] 0.089248993 3.576570640 21.741296641 35.643952105 35.788523807
## [11] 37.371844734 38.141720648 37.754173051 37.283085404 35.934282052
## [16] 34.527751208 34.107167522 22.847270325 5.526199343 0.652958040
## [21] 0.036539284 0.036798503 0.023586722 0.025693779
res <- resid(fit4)
plot(fitted(fit4), res)qqnorm(res)
qqline(res)#0,1,2,3,4,21,22,23 always zero
#5-20 mostly zeroHere, residuals and quantile-quantile plot for the regressive model built with the general(not separated) values of production, lag, capacity and weather variables is checked. It looks like although mostly there is a good fitting with the actual values; there are still problems with the residuals(not random at some places), and lower and upper ends of the Q-Q plot is not even close to ideal. Since ARIMA models have an extremely higher AIC values when compared to linear models and not good at predicting hourly models, they are not included in the forecasting phase.
5.1 Error Comparison
error_calc=function(actual,forecast){
n=length(actual)
error=actual-forecast
mean=mean(actual)
sd=sd(actual)
CV=sd/mean
FBias=sum(error)/sum(actual)
RMSE=sqrt(sum(error^2)/n)
MAD=sum(abs(error))/n
MADP=sum(abs(error))/sum(abs(actual))
WMAPE=MAD/mean
df=data.frame(n,mean,sd,CV,FBias,RMSE,MAD,MADP,WMAPE)
return(df)
}
result_hourly_pred <- error_calc(prod[date > ymd(20220201) & date < ymd(20220521)]$production, prod[date > ymd(20220201) & date < ymd(20220521)]$hourly_prediction)
result_lm4 <- error_calc(prod[date > ymd(20220307) & date < ymd(20220521)]$production, prod[date > ymd(20220307) & date < ymd(20220521)]$lm4_pred)
result_hourly_pred## n mean sd CV FBias RMSE MAD MADP
## 1 2592 11.13176 14.28802 1.283537 -0.003203597 2.92947 1.454857 0.1306943
## WMAPE
## 1 0.1306943
result_lm4## n mean sd CV FBias RMSE MAD MADP
## 1 1776 12.49916 14.81832 1.185545 0.06457994 5.316521 3.047768 0.2438378
## WMAPE
## 1 0.2438378
Comparing the error values of the models, and considering our previous observations, it seems almost obvious that separately built model(for each hour) is a better choice for usage.
6. Results
7. Conclusions and Future Work
8. Code
Rmd codes are here